# QC: per-sample quality control
# Compute the mitochondrial-read percentage ("percent.mt") for every sample
# object; genes whose names start with "MT-" are treated as mitochondrial.
for (sample_name in c("C1_YF", "C1_ZY", "C2_YF", "C2_ZC",
                      "C2_ZY", "C3_YF", "C3_ZY", "C4_ZY")) {
  seu <- get(sample_name)
  seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "^MT-")
  assign(sample_name, seu)
}
# Visualize QC metrics as violin plots for each sample:
# Violin plots of the three QC metrics (genes/cell, UMIs/cell, mito %) for
# each sample. The Seurat warnings below are benign: the objects have not
# been normalized yet, so plotting falls back from the "data" layer to the
# raw "counts" layer.
VlnPlot(C1_YF, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

VlnPlot(C1_ZY, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C2_YF, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C2_ZC, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C2_ZY, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C3_YF, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C3_ZY, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

VlnPlot(C4_ZY, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

# An integrated violin plot:
# Merge all eight samples into one object so the QC metrics can be compared
# side by side; cell barcodes are prefixed with the sample name to keep them
# unique across samples.
merged <- merge(
C1_YF, # anchor object; the remaining samples are supplied via `y`
y = list(C1_ZY, C2_YF, C2_ZC, C2_ZY, C3_YF, C3_ZY, C4_ZY),
add.cell.ids = c("C1_YF","C1_ZY","C2_YF","C2_ZC","C2_ZY","C3_YF","C3_ZY","C4_ZY")
)
# One violin panel per metric, grouped by sample of origin; pt.size = 0
# hides the per-cell points, which would otherwise dominate the plot.
VlnPlot(merged,
features = c("nFeature_RNA","nCount_RNA","percent.mt"),
group.by = "orig.ident",
pt.size = 0,
ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

# Scatter plots to inspect feature-feature relationships:
# UMIs vs. genes (should correlate) and UMIs vs. mito % (high-mito cells).
plot1 <- FeatureScatter(merged, "nCount_RNA", "nFeature_RNA")
plot2 <- FeatureScatter(merged, "nCount_RNA", "percent.mt")
plot1 + plot2

# Summary table of samples before filtering:
# Collect all per-sample Seurat objects in a named list, then tabulate the
# number of cells (columns) and genes (rows) of each BEFORE QC filtering.
samples <- list(
  C1_YF = C1_YF, C1_ZY = C1_ZY, C2_YF = C2_YF, C2_ZC = C2_ZC,
  C2_ZY = C2_ZY, C3_YF = C3_YF, C3_ZY = C3_ZY, C4_ZY = C4_ZY
)
summary_before <- data.frame(
  sample = names(samples),
  # vapply (rather than sapply) guarantees a typed vector regardless of input
  cells = vapply(samples, ncol, integer(1)),
  genes = vapply(samples, nrow, integer(1))
)
print(summary_before)
## sample cells genes
## C1_YF C1_YF 21481 21640
## C1_ZY C1_ZY 13567 21572
## C2_YF C2_YF 11717 21715
## C2_ZC C2_ZC 6605 18492
## C2_ZY C2_ZY 9352 22523
## C3_YF C3_YF 9169 22668
## C3_ZY C3_ZY 8532 22970
## C4_ZY C4_ZY 1516 16470
# Filter cells by QC thresholds:
# Keep cells that express more than 200 genes and have < 20% mitochondrial
# reads; applied identically to every sample.
filtered_samples <- lapply(samples, function(seu) {
  subset(seu, subset = nFeature_RNA > 200 & percent.mt < 20)
})
## Summary table after filtering: cells retained per sample and the number of
## genes still detected (>= 1 count in >= 1 remaining cell).
summary_after <- data.frame(
  sample = names(filtered_samples),
  cells = vapply(filtered_samples, ncol, integer(1)),
  genes_detected = vapply(filtered_samples, function(obj) {
    # `layer` replaces the `slot` argument deprecated in SeuratObject 5.0.0
    # (see the lifecycle warning below, emitted by the old `slot =` call).
    counts <- GetAssayData(obj, assay = "RNA", layer = "counts")
    sum(rowSums(counts > 0) > 0)
  }, numeric(1))
)
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(summary_after)
## sample cells genes_detected
## C1_YF C1_YF 8160 21482
## C1_ZY C1_ZY 8497 21511
## C2_YF C2_YF 11348 21715
## C2_ZC C2_ZC 6398 18492
## C2_ZY C2_ZY 9100 22523
## C3_YF C3_YF 8176 22666
## C3_ZY C3_ZY 7808 22969
## C4_ZY C4_ZY 1406 16469
# Log-normalize & find HVGs:
# Log-normalize each filtered sample (scale factor 1e4), then select the
# 2,000 most highly variable genes with the "vst" method.
normalized_samples <- lapply(filtered_samples, function(seu) {
  seu <- NormalizeData(seu, normalization.method = "LogNormalize",
                       scale.factor = 1e4)
  FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
})
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
## Normalizing layer: counts
## Finding variable features for layer counts
# Example on first sample:
# Inspect highly variable genes (HVGs) on the first normalized sample.
obj1 <- normalized_samples[[1]]
# VariableFeatures is ordered by decreasing variability, so head/tail give
# the most and least variable of the 2,000 selected genes.
top10 <- head(VariableFeatures(obj1), 10)
bot10 <- tail(VariableFeatures(obj1), 10)
# NOTE(review): `caption` is not an argument of print() for character
# vectors and is silently ignored; it serves only as an inline reminder.
print(top10, caption="Top 10 highly variable genes")
## [1] "SPP1" "JCHAIN" "IGHA1" "APOE" "IGKC" "IGLC2" "APOC1" "CXCL14"
## [9] "MMP12" "COL1A1"
print(bot10, caption="Bottom 10 least variable genes")
## [1] "ZNF521" "FA2H" "OIP5" "HIST1H4H" "TSPAN1"
## [6] "ASNS" "ETS2" "LRRK2" "CD300E" "AP001160.2"
# Mean-variance plot, then the same plot with the top/bottom 10 HVGs labeled.
p1 <- VariableFeaturePlot(obj1)
p2 <- LabelPoints(p1, points=top10, repel=TRUE, xnudge=0, ynudge=0)
p3 <- LabelPoints(p1, points=bot10, repel=TRUE, xnudge=0, ynudge=0)
print(p1)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

print(p2, caption="Top10-HVG")
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

print(p3, caption="Least variable genes")
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

# Scale data and PCA:
# Scale each sample (regressing out UMI count and mito %, as described in
# the publication) and run PCA on its variable features.
pca_samples <- lapply(normalized_samples, function(seu) {
  seu <- ScaleData(seu, vars.to.regress = c("nCount_RNA", "percent.mt"))
  RunPCA(seu, features = VariableFeatures(object = seu), verbose = FALSE)
})
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
# Determine optimal PCs and an optimal pK:
# Choose the number of informative PCs per sample with two heuristics:
#   c1 - first PC where cumulative variance > 90% AND the PC adds < 5%
#   c2 - last PC before the %-variance curve drops by > 0.1 points
pc_nums <- lapply(pca_samples, function(obj) {
  sdev <- obj[["pca"]]@stdev
  pct <- sdev / sum(sdev) * 100
  cum_pct <- cumsum(pct)
  c1 <- which(cum_pct > 90 & pct < 5)[1]
  # BUGFIX: diff(pct) is pct[i + 1] - pct[i], which is negative where the
  # variance curve decreases, so a drop of more than 0.1 points is
  # diff(pct) < -0.1. The original `diff(pct) > 0.1` almost never fires,
  # leaving c2 (and hence the min below) NA.
  c2 <- sort(which(diff(pct) < -0.1), decreasing = TRUE)[1] + 1
  # na.rm guards against either heuristic finding no qualifying PC
  min(c(c1, c2), length(sdev), na.rm = TRUE)
})
# Fixed neighborhood-size parameter (kept for reference; scDblFinder below
# selects its own parameters).
pk_values <- 0.09
# Doublet Finder:
## DoubletFinder proved unreliable here (package bugs), so scDblFinder is
## used for doublet detection instead.
library(scDblFinder)
library(SingleCellExperiment)
# Convert each per-sample Seurat object to a SingleCellExperiment and run
# scDblFinder; the returned SCE carries scDblFinder.score / scDblFinder.class
# in its colData.
sce_list <- lapply(pca_samples, function(seu) {
  scDblFinder(as.SingleCellExperiment(seu))
})
## Creating ~6528 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1083 cells excluded from training.
## iter=1, 1240 cells excluded from training.
## iter=2, 1267 cells excluded from training.
## Threshold found:0.473
## 909 (11.1%) doublets called
## Creating ~6798 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1252 cells excluded from training.
## iter=1, 1455 cells excluded from training.
## iter=2, 1506 cells excluded from training.
## Threshold found:0.527
## 1094 (12.9%) doublets called
## Creating ~9079 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1748 cells excluded from training.
## iter=1, 1839 cells excluded from training.
## iter=2, 1846 cells excluded from training.
## Threshold found:0.414
## 1367 (12%) doublets called
## Creating ~5119 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 589 cells excluded from training.
## iter=1, 520 cells excluded from training.
## iter=2, 437 cells excluded from training.
## Threshold found:0.39
## 350 (5.5%) doublets called
## Creating ~7280 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1371 cells excluded from training.
## iter=1, 1461 cells excluded from training.
## iter=2, 1511 cells excluded from training.
## Threshold found:0.419
## 1158 (12.7%) doublets called
## Creating ~6541 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1330 cells excluded from training.
## iter=1, 1487 cells excluded from training.
## iter=2, 1451 cells excluded from training.
## Threshold found:0.409
## 962 (11.8%) doublets called
## Creating ~6247 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1433 cells excluded from training.
## iter=1, 1511 cells excluded from training.
## iter=2, 1512 cells excluded from training.
## Threshold found:0.437
## 900 (11.5%) doublets called
## Creating ~1500 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 150 cells excluded from training.
## iter=1, 155 cells excluded from training.
## iter=2, 157 cells excluded from training.
## Threshold found:0.671
## 68 (4.8%) doublets called
# Now each element of sce_list has scDblFinder.score & .class
# To inspect sample 1 (counts of singlet vs. doublet calls):
table(sce_list[[1]]$scDblFinder.class)
##
## singlet doublet
## 7251 909
# Tabulate doublet calls: per-sample counts and fractions of each class.
doublet_summary <- lapply(names(sce_list), function(sample_name) {
  calls <- table(sce_list[[sample_name]]$scDblFinder.class)
  data.frame(
    sample = sample_name,
    class = names(calls),
    count = as.integer(calls),
    fraction = as.numeric(prop.table(calls)),
    row.names = NULL
  )
}) %>% bind_rows()
print(doublet_summary)
## sample class count fraction
## 1 C1_YF singlet 7251 0.88860294
## 2 C1_YF doublet 909 0.11139706
## 3 C1_ZY singlet 7403 0.87124868
## 4 C1_ZY doublet 1094 0.12875132
## 5 C2_YF singlet 9981 0.87953824
## 6 C2_YF doublet 1367 0.12046176
## 7 C2_ZC singlet 6048 0.94529540
## 8 C2_ZC doublet 350 0.05470460
## 9 C2_ZY singlet 7942 0.87274725
## 10 C2_ZY doublet 1158 0.12725275
## 11 C3_YF singlet 7214 0.88233855
## 12 C3_YF doublet 962 0.11766145
## 13 C3_ZY singlet 6908 0.88473361
## 14 C3_ZY doublet 900 0.11526639
## 15 C4_ZY singlet 1338 0.95163585
## 16 C4_ZY doublet 68 0.04836415
# Cluster and visualize:
# Graph-based clustering (Louvain, resolution 0.5) and UMAP embedding on the
# first 20 principal components of each sample.
obj_with_clusters <- lapply(pca_samples, function(seu) {
  seu <- FindNeighbors(seu, dims = 1:20)
  seu <- FindClusters(seu, resolution = 0.5)
  RunUMAP(seu, dims = 1:20)
})
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8160
## Number of edges: 258430
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8670
## Number of communities: 14
## Elapsed time: 0 seconds
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 22:03:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:03:26 Read 8160 rows and found 20 numeric columns
## 22:03:26 Using Annoy for neighbor search, n_neighbors = 30
## 22:03:26 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:03:27 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b16988e1de
## 22:03:27 Searching Annoy index using 1 thread, search_k = 3000
## 22:03:29 Annoy recall = 100%
## 22:03:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:03:31 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:03:32 Commencing optimization for 500 epochs, with 336312 positive edges
## 22:03:40 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8497
## Number of edges: 297356
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8987
## Number of communities: 16
## Elapsed time: 0 seconds
## 22:03:43 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:03:43 Read 8497 rows and found 20 numeric columns
## 22:03:43 Using Annoy for neighbor search, n_neighbors = 30
## 22:03:43 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:03:44 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b15e7fca72
## 22:03:44 Searching Annoy index using 1 thread, search_k = 3000
## 22:03:46 Annoy recall = 100%
## 22:03:47 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:03:49 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:03:49 Commencing optimization for 500 epochs, with 358890 positive edges
## 22:03:57 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11348
## Number of edges: 407997
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9005
## Number of communities: 16
## Elapsed time: 1 seconds
## 22:04:02 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:04:02 Read 11348 rows and found 20 numeric columns
## 22:04:02 Using Annoy for neighbor search, n_neighbors = 30
## 22:04:02 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:04:03 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b1224dbc4a
## 22:04:03 Searching Annoy index using 1 thread, search_k = 3000
## 22:04:06 Annoy recall = 100%
## 22:04:07 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:04:09 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:04:09 Commencing optimization for 200 epochs, with 500540 positive edges
## 22:04:14 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6398
## Number of edges: 246149
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9200
## Number of communities: 15
## Elapsed time: 0 seconds
## 22:04:16 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:04:16 Read 6398 rows and found 20 numeric columns
## 22:04:16 Using Annoy for neighbor search, n_neighbors = 30
## 22:04:16 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:04:17 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b11c8f2c37
## 22:04:17 Searching Annoy index using 1 thread, search_k = 3000
## 22:04:19 Annoy recall = 100%
## 22:04:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:04:22 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:04:22 Commencing optimization for 500 epochs, with 287608 positive edges
## 22:04:28 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9100
## Number of edges: 328099
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8920
## Number of communities: 15
## Elapsed time: 1 seconds
## 22:04:32 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:04:32 Read 9100 rows and found 20 numeric columns
## 22:04:32 Using Annoy for neighbor search, n_neighbors = 30
## 22:04:32 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:04:33 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b12d079897
## 22:04:33 Searching Annoy index using 1 thread, search_k = 3000
## 22:04:35 Annoy recall = 100%
## 22:04:36 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:04:38 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:04:38 Commencing optimization for 500 epochs, with 397244 positive edges
## 22:04:47 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8176
## Number of edges: 287924
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8976
## Number of communities: 13
## Elapsed time: 0 seconds
## 22:04:50 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:04:50 Read 8176 rows and found 20 numeric columns
## 22:04:50 Using Annoy for neighbor search, n_neighbors = 30
## 22:04:50 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:04:51 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b113ae7624
## 22:04:51 Searching Annoy index using 1 thread, search_k = 3000
## 22:04:53 Annoy recall = 100%
## 22:04:54 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:04:56 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:04:56 Commencing optimization for 500 epochs, with 355170 positive edges
## 22:05:05 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 7808
## Number of edges: 278673
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8986
## Number of communities: 11
## Elapsed time: 0 seconds
## 22:05:07 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:05:07 Read 7808 rows and found 20 numeric columns
## 22:05:07 Using Annoy for neighbor search, n_neighbors = 30
## 22:05:07 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:05:08 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b15e736194
## 22:05:08 Searching Annoy index using 1 thread, search_k = 3000
## 22:05:10 Annoy recall = 100%
## 22:05:11 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:05:13 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:05:13 Commencing optimization for 500 epochs, with 330388 positive edges
## 22:05:21 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1406
## Number of edges: 46431
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8657
## Number of communities: 10
## Elapsed time: 0 seconds
## 22:05:21 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:05:21 Read 1406 rows and found 20 numeric columns
## 22:05:21 Using Annoy for neighbor search, n_neighbors = 30
## 22:05:21 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:05:22 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b172f23141
## 22:05:22 Searching Annoy index using 1 thread, search_k = 3000
## 22:05:22 Annoy recall = 100%
## 22:05:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:05:24 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:05:24 Commencing optimization for 500 epochs, with 54588 positive edges
## 22:05:26 Optimization finished
# Visualize UMAP and cluster labels for every sample. Kept as a top-level
# lapply so the returned list auto-prints, as in the original.
lapply(names(obj_with_clusters), function(sample_name) {
  umap_plot <- DimPlot(obj_with_clusters[[sample_name]],
                       reduction = "umap", label = TRUE) +
    ggtitle(paste0("UMAP - Clusters: ", sample_name))
  print(umap_plot)
})








## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

# Count cells per cluster within each sample.
cluster_summary <- lapply(names(obj_with_clusters), function(sample_name) {
  cluster_tab <- table(Idents(obj_with_clusters[[sample_name]]))
  data.frame(
    sample = sample_name,
    cluster = names(cluster_tab),
    count = as.integer(cluster_tab)
  )
}) %>% bind_rows()
print(cluster_summary)
## sample cluster count
## 1 C1_YF 0 1813
## 2 C1_YF 1 1729
## 3 C1_YF 2 1532
## 4 C1_YF 3 549
## 5 C1_YF 4 520
## 6 C1_YF 5 442
## 7 C1_YF 6 388
## 8 C1_YF 7 361
## 9 C1_YF 8 333
## 10 C1_YF 9 152
## 11 C1_YF 10 126
## 12 C1_YF 11 110
## 13 C1_YF 12 71
## 14 C1_YF 13 34
## 15 C1_ZY 0 1422
## 16 C1_ZY 1 1411
## 17 C1_ZY 2 1024
## 18 C1_ZY 3 881
## 19 C1_ZY 4 855
## 20 C1_ZY 5 603
## 21 C1_ZY 6 518
## 22 C1_ZY 7 444
## 23 C1_ZY 8 433
## 24 C1_ZY 9 382
## 25 C1_ZY 10 153
## 26 C1_ZY 11 95
## 27 C1_ZY 12 93
## 28 C1_ZY 13 84
## 29 C1_ZY 14 69
## 30 C1_ZY 15 30
## 31 C2_YF 0 2416
## 32 C2_YF 1 2243
## 33 C2_YF 2 1792
## 34 C2_YF 3 1360
## 35 C2_YF 4 819
## 36 C2_YF 5 501
## 37 C2_YF 6 447
## 38 C2_YF 7 396
## 39 C2_YF 8 312
## 40 C2_YF 9 304
## 41 C2_YF 10 207
## 42 C2_YF 11 155
## 43 C2_YF 12 149
## 44 C2_YF 13 145
## 45 C2_YF 14 87
## 46 C2_YF 15 15
## 47 C2_ZC 0 1387
## 48 C2_ZC 1 831
## 49 C2_ZC 2 671
## 50 C2_ZC 3 609
## 51 C2_ZC 4 442
## 52 C2_ZC 5 422
## 53 C2_ZC 6 350
## 54 C2_ZC 7 304
## 55 C2_ZC 8 279
## 56 C2_ZC 9 263
## 57 C2_ZC 10 244
## 58 C2_ZC 11 239
## 59 C2_ZC 12 209
## 60 C2_ZC 13 82
## 61 C2_ZC 14 66
## 62 C2_ZY 0 3158
## 63 C2_ZY 1 1584
## 64 C2_ZY 2 1085
## 65 C2_ZY 3 912
## 66 C2_ZY 4 500
## 67 C2_ZY 5 334
## 68 C2_ZY 6 307
## 69 C2_ZY 7 272
## 70 C2_ZY 8 207
## 71 C2_ZY 9 175
## 72 C2_ZY 10 135
## 73 C2_ZY 11 133
## 74 C2_ZY 12 112
## 75 C2_ZY 13 97
## 76 C2_ZY 14 89
## 77 C3_YF 0 2593
## 78 C3_YF 1 1524
## 79 C3_YF 2 1075
## 80 C3_YF 3 504
## 81 C3_YF 4 488
## 82 C3_YF 5 385
## 83 C3_YF 6 374
## 84 C3_YF 7 328
## 85 C3_YF 8 296
## 86 C3_YF 9 290
## 87 C3_YF 10 184
## 88 C3_YF 11 86
## 89 C3_YF 12 49
## 90 C3_ZY 0 1861
## 91 C3_ZY 1 1663
## 92 C3_ZY 2 1143
## 93 C3_ZY 3 821
## 94 C3_ZY 4 780
## 95 C3_ZY 5 577
## 96 C3_ZY 6 467
## 97 C3_ZY 7 233
## 98 C3_ZY 8 104
## 99 C3_ZY 9 87
## 100 C3_ZY 10 72
## 101 C4_ZY 0 443
## 102 C4_ZY 1 305
## 103 C4_ZY 2 238
## 104 C4_ZY 3 91
## 105 C4_ZY 4 72
## 106 C4_ZY 5 70
## 107 C4_ZY 6 61
## 108 C4_ZY 7 48
## 109 C4_ZY 8 40
## 110 C4_ZY 9 38
# Identify marker genes per cluster (example on first object):
# Find positive marker genes for every cluster of the first sample: a gene
# must be detected in >= 25% of cells in either group and show >= 0.25 log2
# fold-change (Wilcoxon rank-sum test, Seurat's default).
obj1 <- obj_with_clusters[[1]]
markers_obj1 <- FindAllMarkers(obj1, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
# Top 5 markers per cluster by average log2 fold-change (top_n keeps ties
# and preserves the original row order).
top5_markers <- markers_obj1 %>%
  group_by(cluster) %>%
  top_n(n = 5, wt = avg_log2FC)
print(top5_markers)
## # A tibble: 70 × 7
## # Groups: cluster [14]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 5.63e-202 0.980 0.934 0.595 1.22e-197 0 SULT1C2
## 2 1.09e- 97 1.18 0.437 0.189 2.36e- 93 0 GSTA1
## 3 1.22e- 70 0.996 0.912 0.648 2.64e- 66 0 NTS
## 4 4.94e- 69 1.00 0.366 0.171 1.07e- 64 0 REG4
## 5 2.25e- 61 1.10 0.272 0.111 4.87e- 57 0 PNPLA3
## 6 1.51e-201 1.63 0.667 0.308 3.26e-197 1 EDN1
## 7 4.19e-197 1.39 0.828 0.502 9.06e-193 1 GDF15
## 8 2.94e-153 1.75 0.521 0.222 6.36e-149 1 EGR4
## 9 3.99e-127 1.56 0.629 0.357 8.63e-123 1 KRT17
## 10 4.38e-126 1.69 0.486 0.216 9.47e-122 1 MMP7
## # ℹ 60 more rows
# 5. Visualize marker expression for key clusters (example with the top 3
#    clusters by cell count in the first sample).
first_sample <- names(obj_with_clusters)[1]
top_clusters <- cluster_summary %>%
  filter(sample == first_sample) %>%
  top_n(3, count) %>%
  pull(cluster) %>%
  unique()
for (cl in top_clusters) {
  # Top 5 markers of this cluster by average log2 fold-change
  top_genes <- markers_obj1 %>%
    filter(cluster == cl) %>%
    top_n(5, avg_log2FC) %>%
    pull(gene)
  print(FeaturePlot(obj1, features = top_genes, reduction = "umap") +
          ggtitle(paste("Cluster", cl, "Top Genes")))
}



# 6. Automated annotation (example with SingleR, using Blueprint+ENCODE as ref)
# NOTE(review): relies on the celldex and SingleR packages being attached.
ref <- BlueprintEncodeData()
sce <- as.SingleCellExperiment(obj1)
# Predict a main-level cell-type label for every cell against the reference.
preds <- SingleR(test = sce, ref = ref, labels = ref$label.main)
# Add labels to Seurat object
obj1$SingleR_label <- preds$labels
# Plot UMAP with labels
DimPlot(obj1, group.by = "SingleR_label", label = TRUE, reduction = "umap")

#manual_labels <- c(
# "Adipocytes" , # 0: SULT1C2, GSTA1, NTS, REG4, PNPLA3
#"Inflammatory Epithelium", # 1: EDN1, GDF15, EGR4, KRT17, MMP7
#"CEA+ Tumor Epithelium", # 2: CEACAM5, ADGRF1, HRASLS2, PSORS1C2, FGFBP1
# "CD8+ Cytotoxic T Cells", # 3: IFNG, CD3G, GZMH, CD8B, CD8A
# "Myofibroblasts / CAF", # 4: ASPN, HTRA3, OMD, VGLL3, LRRC15
# "Myeloid / Monocyte-like", # 5: FPR3, FCGR1A, CLEC5A, CSF2RA, LINC02345
# "Proliferating Epithelium (S)", # 6: DTL, MCM10, MND1, CDC45, EXO1
# "NK / γδ T Cells", # 7: TNFRSF18, GNLY, TMIGD2, TRDC, KRT81
# "Cycling Cells (G2/M)", # 8: CCNB1, CDC20, CCNB2, CENPA, NEK2
# "Mast Cells", # 9: CPA3, TPSAB1, TPSB2, MS4A2, HDC
# "Unannotated", # 10
# "Unannotated", # 11
# "Unannotated", # 12
# "Unannotated" # 13
#)
# Name the labels with character cluster identities
#names(manual_labels) <- as.character(0:13)
# Assign to metadata
#obj1$manual_label <- manual_labels[as.character(Idents(obj1))]
# Plot with UMAP
#DimPlot(obj1, group.by = "manual_label", label = TRUE, reduction = "umap") +
# ggtitle("Manual Annotation of Cell Types")
# Combine all per-sample normalized Seurat objects into a single object.
# add.cell.ids prepends each sample's name to its cell barcodes so that
# barcodes shared across samples remain unique after the merge.
first_sample <- normalized_samples[[1]]
remaining_samples <- normalized_samples[-1]
merged_all <- merge(
  x = first_sample,
  y = remaining_samples,
  add.cell.ids = names(normalized_samples),
  project = "MergedProject"
)
# Scale and PCA:
# Center/scale each gene across cells, regressing out sequencing depth
# (nCount_RNA) and mitochondrial fraction (percent.mt) as nuisance covariates.
merged_all <- ScaleData(merged_all, vars.to.regress = c("nCount_RNA", "percent.mt"))
## Regressing out nCount_RNA, percent.mt
## Centering and scaling data matrix
# PCA on the previously selected variable features; the loadings printed
# below are a useful sanity check (e.g. PC_1 separates epithelium from myeloid).
merged_all <- RunPCA(merged_all, features = VariableFeatures(merged_all))
## PC_ 1
## Positive: EPCAM, MUC1, AGR2, S100A14, SFTA2, LGALS4, DSG2, TSPAN1, TSPAN8, KRT8
## FXYD3, SLPI, ELF3, KRT19, TFF2, CTSE, GPRC5A, CLDN18, SLC44A4, MAL2
## CLDN4, SMIM22, AGR3, ITGB4, NQO1, TFF1, C19orf33, TFF3, CEACAM6, GPX2
## Negative: TNFAIP3, TYROBP, RGS1, PLIN2, ALOX5AP, C1QC, MS4A7, C1QA, C1QB, AIF1
## FCGR3A, MRC1, GPNMB, FCER1G, MSR1, RGCC, GPR183, FCGR2A, CD163, CTSS
## MS4A4A, GLUL, CYBB, HLA-DQA1, APOE, OLR1, CD14, LAIR1, PLXDC2, VSIG4
## PC_ 2
## Positive: GZMA, IL7R, ISG20, GZMK, FKBP11, IL2RB, RARRES3, TIGIT, GZMB, CCND2
## PMEPA1, IFNG, FBXO32, ICOS, GNLY, BATF, CD27, FKBP10, TNFRSF18, TNFAIP3
## XCL2, BGN, CAV1, COL5A1, CTLA4, RGS16, AEBP1, DCBLD2, ENAH, KLK7
## Negative: CD68, C1QA, MS4A7, CTSB, MRC1, C1QC, C1QB, AIF1, MSR1, FTL
## CTSD, FCER1G, CYBB, HLA-DRA, TYROBP, MS4A4A, CD14, APOE, HLA-DRB1, APOC1
## CAPG, LYZ, GPNMB, FCGR3A, CD163, FCGR2A, VSIG4, TREM2, NPC2, FBP1
## PC_ 3
## Positive: ISG20, IL7R, S100A4, TNFAIP3, GZMA, TXNIP, TFF1, LYZ, ALOX5AP, RGS1
## CCL4, TFF2, TFF3, AGR2, GZMK, GCHFR, PHGR1, PLAC8, CTSE, GPX2
## ZG16B, TMEM238, AREG, BCL2A1, RAB11FIP1, CLDN18, HLA-DRB1, AKR7A3, HPGD, C15orf48
## Negative: SPARC, IGFBP7, BGN, AEBP1, COL6A2, COL5A1, FSTL1, MYL9, TAGLN, NNMT
## TIMP3, TPM2, CNN3, C11orf96, COL4A2, COL6A1, COL4A1, RARRES2, CTGF, CTSK
## GPX8, CYR61, FN1, MGP, VCAN, TCF4, INHBA, LAMA4, IGFBP4, COX7A1
## PC_ 4
## Positive: MKI67, BIRC5, TOP2A, UBE2C, TK1, TPX2, CDKN3, PCLAF, GTSE1, ASPM
## DLGAP5, CCNB2, TYMS, RRM2, NUSAP1, CDK1, ZWINT, CENPF, HMMR, NUF2
## CMBL, CEP55, CCNB1, UBE2T, DTYMK, NTS, CDC20, DIAPH3, KIF4A, MAD2L1
## Negative: MMP7, KLK7, KLK11, LY6K, PRSS2, EEF1A2, SERPINA1, LCN2, PRSS22, TM4SF1
## SPINK1, DEFB1, PRSS1, KRT7, COL17A1, VSTM2L, SPRR3, IGKC, IGHG4, LIF
## IGHA1, IGLC2, CD24, LAMC2, SPRR1B, FAM129B, WNT7B, LAMB3, MT1G, TRIM29
## PC_ 5
## Positive: MKI67, TOP2A, UBE2C, BIRC5, ASPM, TPX2, PCLAF, HMMR, DLGAP5, GTSE1
## ANLN, CENPF, CDK1, RRM2, CCNB2, MMP7, TYMS, NUSAP1, NUF2, CEP55
## CCNA2, SPC25, MAD2L1, STMN1, KIF23, KIF4A, NCAPG, ZWINT, CDKN3, HJURP
## Negative: MMP23B, TM4SF5, JSRP1, MUC3A, AC005550.2, CMBL, FBXO2, PHGR1, HEPH, VSIG2
## CLDN18, NTS, TFF1, ADIRF, LINC01133, AKR7A3, UCA1, ZG16B, TFF3, TXNIP
## MUCL3, PTGDS, TFF2, SDR16C5, HSPA1A, TSPAN1, STARD10, CAPS, BCAS1, S100A4
# Add Sample Metadata (for Harmony):
# Harmony corrects the PCA embedding using a grouping variable; use the
# original sample identity as that batch variable.
merged_all$sample <- merged_all$orig.ident
# Run Harmony Integration
# Produces a "harmony" reduction (batch-corrected PCs) used downstream.
merged_all <- RunHarmony(merged_all, group.by.vars = "sample")
## Transposing data matrix
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
# Build the SNN graph on the first 25 Harmony dimensions (not raw PCA).
merged_all <- FindNeighbors(merged_all, reduction = "harmony", dims = 1:25)
## Computing nearest neighbor graph
## Computing SNN
# Louvain clustering; resolution 0.5 yielded 19 communities here (see log).
merged_all <- FindClusters(merged_all, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 60893
## Number of edges: 2039231
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9146
## Number of communities: 19
## Elapsed time: 18 seconds
# UMAP on the same Harmony dimensions so the embedding reflects integration.
merged_all <- RunUMAP(merged_all, reduction = "harmony", dims = 1:25)
## 22:23:40 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:23:40 Read 60893 rows and found 25 numeric columns
## 22:23:40 Using Annoy for neighbor search, n_neighbors = 30
## 22:23:40 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:23:47 Writing NN index file to temp file /scratch/5516304.1.academic/RtmpTRHoBg/file2da0b16c6e4a7c
## 22:23:47 Searching Annoy index using 1 thread, search_k = 3000
## 22:24:09 Annoy recall = 100%
## 22:24:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:24:14 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:24:17 Commencing optimization for 200 epochs, with 2780682 positive edges
## 22:24:41 Optimization finished
# Visualize Integrated UMAP
# Colored by sample: well-mixed samples indicate successful batch correction.
DimPlot(merged_all, group.by = "sample", reduction = "umap", label = TRUE) +
ggtitle("UMAP after Harmony Integration")

# Colored by cluster assignment from FindClusters above.
DimPlot(merged_all, group.by = "seurat_clusters", reduction = "umap", label = TRUE) +
ggtitle("Clusters after Harmony")

# Fix for Seurat v5 multi-layer assay
# After merge(), Seurat v5 keeps per-sample counts layers; JoinLayers()
# collapses them so as.SingleCellExperiment() sees a single matrix.
merged_all <- JoinLayers(merged_all)
# Pseudotime Analysis with Slingshot:
sce_harmony <- as.SingleCellExperiment(merged_all)
# Fit lineages through the cluster centers in UMAP space.
sce_harmony <- slingshot(sce_harmony, clusterLabels = 'seurat_clusters', reducedDim = 'UMAP')
# Plot Pseudotime Trajectory:
# Cells colored by pseudotime along the first lineage, binned into 100
# rainbow colors. NOTE(review): cut() returns NA for cells with NA
# pseudotime (cells not assigned to lineage 1), giving them color NA —
# confirm those cells are intentionally left uncolored.
plot(
reducedDims(sce_harmony)$UMAP,
col = rainbow(100)[cut(sce_harmony$slingPseudotime_1, 100)],
pch = 16, cex = 0.5,
main = "Pseudotime Trajectory (Slingshot)"
)
# Mean UMAP position per cluster; column 1 of `centers` is the cluster id,
# columns 2-3 are the mean UMAP_1/UMAP_2 coordinates used below.
centers <- aggregate(
reducedDims(sce_harmony)$UMAP,
by = list(cluster = sce_harmony$seurat_clusters),
FUN = mean
)
# Overlay bold cluster labels at the cluster centers.
text(centers[,2], centers[,3], labels = centers$cluster, font = 2)

# --- Cell type proportion analysis ---
# Per-cell metadata table carrying the active cluster identity and the
# sample-of-origin as the grouping variable.
meta_df <- merged_all@meta.data
meta_df$seurat_clusters <- Idents(merged_all)
meta_df$group <- merged_all$sample
# Count cells per (group, cluster) and convert counts to within-group percentages.
prop_df <- dplyr::count(meta_df, group, seurat_clusters)
prop_df <- prop_df %>%
  group_by(group) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ungroup()
# Dodged bar chart: one bar per sample within each cluster on the x-axis.
ggplot(prop_df, aes(x = as.factor(seurat_clusters), y = percentage, fill = group)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  xlab("Cluster") +
  ylab("Percentage of cells") +
  ggtitle("Cell Type Proportions per Cluster by Sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# --- Replicating Vignette Figures: Fig1d and Fig2a ---
# -----------------------------
# Fig1d: DotPlot of marker genes across clusters
# -----------------------------
# Ensure identity class is correctly set
Idents(obj1) <- factor(obj1$seurat_clusters)
# Use only marker genes present in dataset
top5_genes <- intersect(unique(top5_markers$gene), rownames(obj1))
if (length(top5_genes) == 0) stop("No valid marker genes found in obj1")
# Compute average expression across clusters (genes x clusters matrix).
avg <- AggregateExpression(obj1, features = top5_genes, assays = "RNA")$RNA
## First group.by variable `ident` starts with a number, appending `g` to ensure valid variable names
## This message is displayed once every 8 hours.
# Z-score transformation for heatmap clustering (scale each gene across clusters).
sdat <- t(scale(t(avg[top5_genes, ])))
# Cluster the columns only (cluster_rows = FALSE); silent = TRUE suppresses drawing.
phtm <- pheatmap::pheatmap(sdat, cluster_rows = FALSE, silent = TRUE)
# Derive cluster order from the heatmap dendrogram. AggregateExpression
# prefixed numeric idents with "g" (see message above), so strip it to
# recover the original cluster ids.
cluster_order <- gsub("^g", "", phtm$tree_col$labels[phtm$tree_col$order])
Idents(obj1) <- factor(obj1$seurat_clusters, levels = cluster_order)
# Reorder genes by peak cluster expression: for each gene, find the cluster
# where its z-score peaks, then sort genes by that cluster's position.
mxid <- factor(colnames(sdat)[apply(sdat, 1, which.max)], levels = cluster_order)
gord <- rownames(sdat)[order(mxid)]
# Generate DotPlot with the derived gene and cluster ordering.
fig1d <- DotPlot(obj1, features = gord) + coord_flip() +
scale_color_gradientn(colours = rev(brewer.pal(9, "RdYlBu")), guide = "colourbar") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
print(fig1d)

# -----------------------------
# Fig2a: UMAPs colored by marker expression and cluster identity
# -----------------------------
# Classify every cell by EPCAM / MKI67 co-expression from the normalized
# "data" layer. Seurat v5 deprecates the `slot` argument of GetAssayData()
# in favor of `layer` (the deprecation warning appears earlier in this
# document), so use `layer` here.
epcam_expr <- GetAssayData(obj1, assay = "RNA", layer = "data")["EPCAM", ]
mki67_expr <- GetAssayData(obj1, assay = "RNA", layer = "data")["MKI67", ]
obj1$flag <- ifelse(epcam_expr > 0 & mki67_expr > 0, "EPCAM+MKI67+",
ifelse(epcam_expr > 0, "EPCAM+",
ifelse(mki67_expr > 0, "MKI67+", "Other")))
# Fixed level order so colors map consistently across plots.
obj1$flag <- factor(obj1$flag, levels = c("Other", "MKI67+", "EPCAM+MKI67+", "EPCAM+"))
# Extract UMAP coordinates together with per-cell metadata for ggplot.
umap_data <- data.frame(
Embeddings(obj1, "umap"),
cluster = as.character(Idents(obj1)),
marker = obj1$flag,
group = obj1$orig.ident
)
colnames(umap_data)[1:2] <- c("x", "y")
# Define color schemes
mkcols <- c("#222222", "#F3C300", "#875692", "#F38400")
ecols <- colorRampPalette(brewer.pal(8, "Set2"))(length(unique(umap_data$cluster)))
# brewer.pal() warns and returns 3 colors when asked for n < 3; request at
# least 3 and keep exactly one color per group to avoid the warning.
n_groups <- length(unique(umap_data$group))
groupCols <- brewer.pal(max(n_groups, 3L), "Dark2")[seq_len(n_groups)]
# Fig2a: two UMAP panels that share the same point layout.
umap_base <- ggplot(umap_data, aes(x, y))
# Left panel: cells colored by the EPCAM/MKI67 marker flag.
pp1 <- umap_base +
  geom_point(aes(color = marker), size = 0.25) +
  scale_color_manual(values = mkcols) +
  theme_pubr() + NoAxes() + NoLegend()
# Per-cluster median coordinates used to place the cluster labels.
centroids <- umap_data %>%
  group_by(cluster) %>%
  summarise(x = median(x), y = median(y))
# Right panel: cells colored by cluster, labeled at each cluster's centroid.
pp2 <- umap_base +
  geom_point(aes(color = cluster), size = 0.25) +
  scale_color_manual(values = ecols) +
  geom_text(data = centroids, aes(label = cluster),
            color = "black", size = 3, fontface = "bold") +
  theme_pubr() + NoAxes() + NoLegend()
# Render both panels side by side.
print(ggarrange(pp1, pp2, ncol = 2))

# Record the full R session (R version, attached packages) for reproducibility.
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")